import jieba
import jieba.analyse
import os
# from ..config.config import *
import pandas as pd
import pymongo
import re

# Project root: parent of the directory this file lives in.
BASE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
# jieba resources: stop-word list and user dictionary (os.path.join keeps the
# paths portable instead of hard-coding '/' separators).
STOPWORDS_PATH = os.path.join(BASE_PATH, 'analysis', 'config', 'stopwords.txt')
USER_CORPUS = os.path.join(BASE_PATH, 'analysis', 'config', 'usercorpus.txt')


def concat_all_text(text_dir):
    """Return the concatenated contents of every regular file in text_dir.

    Files are read as UTF-8 text and joined in sorted filename order so the
    result is deterministic (os.listdir order is arbitrary). Subdirectories
    are skipped — the original crashed with IsADirectoryError on them.
    """
    all_txt = list()
    for each_txt in sorted(os.listdir(text_dir)):
        filepath = os.path.join(text_dir, each_txt)
        if not os.path.isfile(filepath):
            continue
        with open(filepath, mode='rt', encoding='UTF-8') as f:
            # f.read() is equivalent to ''.join(f.readlines()) without the
            # intermediate list of lines.
            all_txt.append(f.read())
    return ''.join(all_txt)


def get_hot_words(text):
    """Extract the top-60 TF-IDF keywords from *text* with jieba and save
    the (word, weight) pairs to ./hotwords/cai.xlsx (sheet 'cai').

    Uses the project stop-word list and user dictionary configured at the
    top of this module.
    """
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    jieba.load_userdict(USER_CORPUS)
    df = pd.DataFrame(jieba.analyse.extract_tags(text, topK=60, withWeight=True, allowPOS=()))
    print(df)
    # to_excel raises FileNotFoundError when the target folder is missing,
    # so create it first (no-op when it already exists).
    os.makedirs('./hotwords', exist_ok=True)
    df.to_excel('./hotwords/cai.xlsx', 'cai')


def mkdirs_if_not_exists(directory_):
    """Create directory_ (including parents) if it does not already exist.

    os.makedirs(..., exist_ok=True) replaces the explicit exists/isdir
    check: an existing directory is a no-op, while a path that exists as a
    regular file still raises FileExistsError, matching the original.
    (The duplicate mid-file ``import os`` was removed — os is imported at
    the top of the module.)
    """
    os.makedirs(directory_, exist_ok=True)


def import_data():
    """Dump the 'text' field of every document in the cai.caicai MongoDB
    collection to <BASE_PATH>/analysis/data/cai.txt, one cleaned line per
    document, stripping scraped HTML fragments with the regexes below.
    """
    mongo_uri = 'mongodb://****:*****@0.0.0.0:27017'
    client = pymongo.MongoClient(mongo_uri)
    db = client['cai']
    # NOTE(review): collection.count() is deprecated/removed in newer pymongo
    # versions; count_documents({}) is the modern equivalent — confirm the
    # driver version in use before changing.
    total = db.caicai.count()  # renamed: 'sum' shadowed the builtin
    limit = 20
    # Round up so the final partial page is included — the original
    # int(total / limit) silently dropped the last total % limit documents.
    pages = (total + limit - 1) // limit
    print(total)
    details_dir = BASE_PATH + '/analysis/data/' + os.path.sep
    mkdirs_if_not_exists(details_dir)
    # Open the output file once instead of re-opening it in append mode for
    # every single document.
    with open(details_dir + 'cai' + '.txt', 'a', encoding='UTF-8') as f:
        for page in range(pages):
            # skip(page * limit) instead of the hard-coded skip(page * 20),
            # so pagination stays consistent if 'limit' ever changes.
            for doc in db.caicai.find().skip(page * limit).limit(limit):
                content = doc['text']
                print(content)
                # Strip markup fragments left over from scraping; patterns
                # kept byte-identical to the original.
                cleaned = re.sub('<br/>', '', content)
                cleaned = re.sub('<a.*?a>', '', cleaned)
                cleaned = re.sub('\\\\.\.\.', '', cleaned)
                cleaned = re.sub('<img.*>', '', cleaned)
                cleaned = re.sub('<span.*</span>', '', cleaned)
                f.write(cleaned + '\n')


if __name__ == '__main__':
    # Build one big text from the scraped data files, then extract and
    # export the hot words. Run import_data() first to (re)populate the
    # data directory from MongoDB.
    data_dir = BASE_PATH + '/analysis/data/'
    get_hot_words(concat_all_text(data_dir))
    # import_data()
